library(dplyr)
library(ggplot2)
library(knitr)
library(broom)
library(tidyr)
library(progress)
library(pbapply)
pboptions(type='none')
library(dbscan)
library(gridExtra)
In the standard conformal methods for regression – i.e., the ones which employ standard nonconformity measures – the resulting predictive regions have more or less the same width for all examples in the test set. In many cases, it would be more natural for the size of the regions to vary according to how “hard” it is to predict each example. It is possible to define convenient nonconformity measures, which result in predictive regions with varying width depending on the expected accuracy of the algorithm on each example. As a result, the predictive regions output by this adaptive methodology are typically much tighter than those produced by the simple regression measure.
Function for data generation
#' Generate a heteroscedastic regression sample.
#'
#' Draws n points on an evenly spaced grid over [-3, 3] with response
#' y = 2x + noise, where the noise standard deviation is largest near
#' x = 0 and shrinks towards the edges (Gaussian-bump heteroscedasticity).
#'
#' @param n Number of observations.
#' @param seed RNG seed, set internally so each call is reproducible.
#' @return List with components `x` (grid) and `y` (noisy responses).
sample_data <- function(n, seed)
{
set.seed(seed)
x <- seq(-3, 3, length.out = n)
# Noise sd: baseline 0.3 plus a bump of height 1 centred at x = 0
noise_sd <- 0.3 + 1 * exp(-x^2 / 2)
y <- 2 * x + rnorm(n, sd = noise_sd)
list(x = x, y = y)
}
We consider again a simple regression example.
# Simulation settings: fixed seed for reproducibility, n observations
seed <- 3
n <- 1000
data <- sample_data(n, seed)
x <- data$x
y <- data$y
# Side-by-side: marginal distribution of y and the raw scatter
par(mfrow=c(1,2))
hist(y, breaks=10, col='lightblue', main="Histogram of y values")
plot(x, y, col='lightblue', main="Data")
# Fit linear model with ols
mod <- lm(y ~ x, data)
# Display the regression function
# Evaluate the fitted line on a fine grid spanning the observed x range
x.grid <- data.frame("x"=seq(range(data$x)[1],range(data$x)[2],length.out=1000))
y.grid.hat <- predict(mod, x.grid)
par(mfrow=c(1,1))
plot(x, y, col='lightblue', main="Data and linear regression function")
lines(x.grid$x, y.grid.hat, type='l', col='black', lwd=2)
Function for evaluating the predictions
#' Evaluate prediction intervals on a test set.
#'
#' Computes the empirical coverage and mean width of the intervals
#' [lower, upper] over the test responses, and optionally plots the
#' test data together with the interval bounds.
#'
#' @param x.test Data frame with a column `x` (used only for plotting).
#' @param y.test Numeric vector of test responses.
#' @param x.grid Unused. Kept with a NULL default for backward
#'   compatibility: existing callers omit it entirely.
#' @param lower,upper Numeric vectors of interval bounds, one per test point.
#' @param view_plot If TRUE, draw the scatter plot with the bounds.
#'   NOTE(review): the plot title reads `alpha` from the enclosing
#'   environment — it must exist there when view_plot is TRUE.
#' @return List with `coverage` (empirical coverage) and `width` (mean width).
evaluate_predictions <- function(x.test, y.test, x.grid=NULL, lower, upper, view_plot=TRUE){
  # Logical AND instead of the original arithmetic product of indicators
  covered <- (y.test >= lower) & (y.test <= upper)
  coverage <- mean(covered)
  width <- mean(upper - lower)
  if(view_plot){
    # Sort by x so the bound curves are drawn left to right
    idx.sort <- sort(x.test$x, index.return=TRUE)$ix
    plot(x.test$x[idx.sort], y.test[idx.sort], col='lightblue', main=paste0("Prediction interval, alpha=",alpha, ", coverage=", round(coverage,2), ", width=", round(width,2)),
         xlab="x test", ylab="y test")
    lines(x.test$x[idx.sort], lower[idx.sort], lty=3, col='black', lwd=2)
    lines(x.test$x[idx.sort], upper[idx.sort], lty=3, col='black', lwd=2)
  }
  out <- list("coverage"=coverage,
              "width"=width)
  return(out)
}
# Build a test set for the evaluations
# Different seed (seed + n) so the test data are independent of training
n.test <- 1000
data.test <- sample_data(n.test, seed+n)
x.test <- data.test$x
y.test <- data.test$y
# Wrap in a data frame so predict() can match the model term `x`
x.test <- data.frame("x"=x.test)
#' Split conformal prediction intervals for univariate linear regression.
#'
#' The data are split at random into a training half (used to fit an OLS
#' line) and a calibration half (used to compute absolute-residual
#' nonconformity scores). The intervals are y_hat(x) +/- q, where q is the
#' finite-sample-corrected quantile of the scores, so all intervals share
#' the same width.
#'
#' @param x Numeric vector of covariates.
#' @param y Numeric vector of responses, same length as x.
#' @param x.test Test covariates (vector, or data frame with column `x`).
#' @param alpha Nominal miscoverage level in (0, 1).
#' @return List with `lower_bound` and `upper_bound`, one entry per test
#'   point. The split uses sample(), so results vary across calls unless
#'   the RNG seed is fixed beforehand.
reg_split_conformal <- function(x, y, x.test, alpha){
  n <- length(y)
  n.train <- floor(n/2)
  n.calib <- n-n.train
  # Split the data into training and calibration sets
  idxs.train <- sample(n, size=n.train)
  x.train <- x[idxs.train]
  y.train <- y[idxs.train]
  x.calib <- x[-idxs.train]
  y.calib <- y[-idxs.train]
  data <- data.frame("x"=x.train, "y"=y.train)
  mod <- lm(y~x, data)
  # Nonconformity scores: absolute calibration residuals
  ncs.calib <- abs(y.calib - predict(mod, data.frame("x"=x.calib)))
  # Finite-sample-corrected conformal quantile; cap the probability at 1
  # so quantile() cannot error when (1-alpha)*(n.calib+1)/n.calib > 1
  # (which happens for small calibration sets)
  ncs.quantile <- quantile(ncs.calib, prob=min(1, (1-alpha)*(n.calib+1)/n.calib))
  # Construct the prediction interval around the point prediction
  x.test <- data.frame("x"=x.test)
  y.hat <- predict(mod, x.test)
  lower_bound <- y.hat - ncs.quantile
  upper_bound <- y.hat + ncs.quantile
  out <- list("lower_bound"=lower_bound,
              "upper_bound"=upper_bound)
  # Explicit return (the original relied on the assignment's invisible value)
  return(out)
}
# Nominal significance level
alpha <- .1
# Standard (constant-width) split conformal intervals on the test set
pi.conformal <- reg_split_conformal(x, y, x.test, alpha)
# NOTE(review): the x.grid argument of evaluate_predictions is omitted
# here; this works only because that argument is never evaluated
performance <- evaluate_predictions(x.test, y.test, lower=pi.conformal$lower_bound, upper=pi.conformal$upper_bound)
#' Normalized (locally adaptive) split conformal prediction intervals.
#'
#' Like reg_split_conformal, but residuals are scaled by a per-point
#' difficulty estimate sigma(x), so intervals widen where the model is
#' expected to be less accurate. Difficulty is estimated by regressing
#' the log absolute training residuals on x^2 — suited to noise that is
#' symmetric in x, as in this example; adjust the formula for other
#' designs.
#'
#' @param x Numeric vector of covariates.
#' @param y Numeric vector of responses, same length as x.
#' @param x.test Test covariates (vector, or data frame with column `x`).
#' @param alpha Nominal miscoverage level in (0, 1).
#' @return List with `lower_bound` and `upper_bound`, one entry per test
#'   point. Results vary across calls unless the RNG seed is fixed.
reg_split_normalized_conformal <- function(x, y, x.test, alpha){
  n <- length(y)
  n.train <- floor(n/2)
  n.calib <- n-n.train
  # Split the data into training and calibration sets
  idxs.train <- sample(n, size=n.train)
  x.train <- x[idxs.train]
  y.train <- y[idxs.train]
  x.calib <- x[-idxs.train]
  y.calib <- y[-idxs.train]
  data <- data.frame("x"=x.train, "y"=y.train)
  mod <- lm(y~x, data)
  ## Estimate how difficult it is to predict points on the calibration set
  # Log absolute training residuals; the 1e-6 offset guards against log(0)
  data$log_residuals <- log(abs(mod$residuals) + 1e-6)
  difficulty_model <- lm(log_residuals ~ I(x^2), data)
  y.calib.hat <- predict(mod, newdata = data.frame("x" = x.calib))
  residuals <- abs(y.calib - y.calib.hat)
  # Predicted difficulty sigma on the calibration set (exp keeps it positive)
  sigma.calib <- exp(predict(difficulty_model, newdata = data.frame("x" = x.calib)))
  # Normalized nonconformity scores
  ncs.calib <- residuals / sigma.calib
  # Finite-sample-corrected conformal quantile; cap the probability at 1
  # so quantile() cannot error when (1-alpha)*(n.calib+1)/n.calib > 1
  ncs.quantile <- quantile(ncs.calib, prob=min(1, (1-alpha)*(n.calib+1)/n.calib))
  # Construct the prediction interval with width proportional to sigma(x)
  y.test.hat <- predict(mod, newdata = data.frame("x" = x.test))
  sigma.test.hat <- exp(predict(difficulty_model, newdata = data.frame("x" = x.test)))
  lower_bound <- y.test.hat - ncs.quantile * sigma.test.hat
  upper_bound <- y.test.hat + ncs.quantile * sigma.test.hat
  out <- list("lower_bound"=lower_bound,
              "upper_bound"=upper_bound)
  # Explicit return (the original relied on the assignment's invisible value)
  return(out)
}
# Nominal significance level
alpha <- .1
# Normalized (locally adaptive) split conformal intervals on the test set
pi.conformal <- reg_split_normalized_conformal(x, y, x.test, alpha)
performance <- evaluate_predictions(x.test, y.test, lower=pi.conformal$lower_bound, upper=pi.conformal$upper_bound)
The ConformalInference R package allows you to
automatically generate conformal prediction intervals - both in full and
split conformal framework - from very complex regression models.
# devtools::install_github(repo="ryantibs/conformal", subdir="conformalInference")
library(conformalInference)
The ConformalInference package has a “functional programming” soul, in the sense that one can “extract” the training and the prediction function of the considered model. The model need not necessarily be linear. On the contrary, the package also allows one to exploit non-linear prediction models.
Let us load the dataset that we will use to cover this part.
# Load the Prestige data (originally from the car package) saved in nlr_data.rda
load('nlr_data.rda')
# NOTE(review): attach() puts the columns (income, prestige, education, ...)
# on the search path; the rest of the script relies on these bare names
attach(Prestige)
Prestige is a dataset coming from the car package containing 102 observations of 6 variables. Each row is an observation that relates to an occupation (e.g. chemists, general managers, physicists,…). The columns relate to predictors such as average years of education, percentage of women in the occupation, average income, prestige of the occupation, etc.
Multivariate linear regression model
Let us start from a practical example. If we wanted to construct prediction intervals for forecasts made with a multivariate linear regression model, we would need to:
# Degree-2 orthogonal-polynomial regression of prestige on income
model_poly <- lm(prestige ~ poly(income, degree=2))
# Matrix of the features of the observed dataset
X <- matrix(poly(income, degree=2), ncol=2)
# Vector of responses of the observed dataset
y <- prestige
# Matrix of features of the new test point
# (coordinates are in the orthogonal-polynomial basis, not raw income)
x0 <- matrix(data=c(0.2,0.2), nrow=1, ncol=2)
# Function to perform model training
lm_train = lm.funs(intercept = T)$train.fun
# Function to perform point prediction
lm_predict = lm.funs(intercept = T)$predict.fun
Then we give (i) the matrix of observed features, (ii) the observed
response variable and (iii) the matrix of features of the test set, (iv)
the functions used for training the model and for prediction, as input
to the conformal.pred function, namely the function that
builds the conformal PI in a full-conformal framework.
alpha <- 0.1
# Full conformal prediction interval at the single test point x0
c_preds <- conformal.pred(x=X, y=y, x0=x0, alpha=alpha,
verbose=F, train.fun=lm_train, predict.fun=lm_predict,
num.grid.pts = 200)
sprintf("Point prediction: %f", c_preds$pred)
## [1] "Point prediction: 14.643633"
sprintf("Lower bound: %f", c_preds$lo)
## [1] "Lower bound: -3.296273"
sprintf("Upper bound: %f", c_preds$up)
## [1] "Upper bound: 32.709171"
Now we want to have a visualization of the conformal prediction intervals for many possible values of income. To do so, we need to specify as the matrix of features of the test set the evaluations of the predictors over a grid of income values. We proceed as follows:
#' In this case, since we want to have a visualization of the conformal
#' prediction intervals for many points of the domain, our matrix of features
#' of the test set will be made by the evaluations of the predictors over a grid.
# Grid of income values spanning the observed range (step = 100)
income.grid <- seq(range(income)[1],range(income)[2],by=100)
# Evaluate the SAME orthogonal-polynomial basis used in training: passing
# the stored `coefs` keeps the grid features in the training parameterization
X_test_grid <- matrix(poly(income.grid,degree=2,
coefs=attr(poly(income,degree=2),"coefs")),
ncol=2)
Now we can compute the point prediction and the lower and upper
bounds of the conformal intervals of each point of the test points in
X_test_grid.
alpha <- 0.1
# Full conformal intervals at every point of the income grid
c_preds <- conformal.pred(x=X, y=y, x0=X_test_grid, alpha=alpha,
verbose=F, train.fun=lm_train, predict.fun=lm_predict,
num.grid.pts = 200)
# Scatter with the point prediction (solid) and the conformal band (dashed)
plot(income, prestige, xlim=range(income.grid), cex =.5, col ="lightblue",
main='Polynomial Regression')
lines(income.grid, c_preds$pred, lwd=2, col ="black", lty=1)
matlines(income.grid, cbind(c_preds$up,c_preds$lo), lwd=2, col="black",lty=2)
In order to compute the PI in a split conformal framework, one can
use the conformal.pred.split function.
# Split conformal: cheaper than full conformal, but the band depends on the
# random train/calibration split (rho = fraction used for training)
c_preds_split <- conformal.pred.split(x=X, y=y, x0=X_test_grid, alpha=alpha,
verbose=F,
train.fun=lm_train,
predict.fun=lm_predict,
rho=0.5)
plot(income, prestige, xlim=range(income.grid), cex =.5, col ="lightblue",
main='Polynomial Regression')
lines(income.grid, c_preds_split$pred, lwd=2, col ="black", lty=1)
matlines(income.grid, cbind(c_preds_split$up,c_preds_split$lo), lwd=2, col="black", lty=2)
Of course, beware that every execution of the
conformal.pred.split function will build prediction
intervals that differ in principle.
More generally, whenever a point predictor can be formulated as a linear regression model, everything can be done in the same exact fashion. The only thing to pay attention to is the definition of the design matrix.
Splines
library(splines)
# Interior knots at the income quintiles, plus an extra knot at 15000
breaks <- c(quantile(income, probs=c(0.2,0.4,0.6,0.8)), 15000)
model_cut <- lm(prestige ~ bs(income, degree=3, knots=breaks))
# Matrix of features of the observed data (cubic B-spline basis)
X <- bs(income, degree=3, knots=breaks)
# Vector of responses of the observed dataset
y <- prestige
# And evaluate the predictors over a grid
X_test_grid <- matrix(bs(income.grid, degree=3, knots=breaks), nrow=length(income.grid))
# Function to perform model training
lm_train = lm.funs(intercept = T)$train.fun
# Function to perform point prediction
lm_predict = lm.funs(intercept = T)$predict.fun
Then everything works just as before.
# Full conformal band over the grid, now with the B-spline design matrix
c_preds <- conformal.pred(x=X, y=y, x0=X_test_grid,
alpha=alpha, verbose=F, train.fun = lm_train,
predict.fun=lm_predict, num.grid.pts=200)
plot(income, prestige, xlim=range(income.grid), cex=.5, col ="lightblue",
main='Spline Regression')
lines(income.grid, c_preds$pred, lwd=2, col="black", lty=1)
matlines(income.grid, cbind(c_preds$up,c_preds$lo), lwd=1, col="black", lty=2)
In a “nonlm” case, custom functions for the point predictor need to be created. In particular, we need to explicitly define the prediction model objects, which will be used to output predictions.
Smoothing splines
Let us first start with smoothing splines.
# Fit a smoothing spline and record its effective degrees of freedom,
# reused later by the conformal training function
fit <- smooth.spline(income, prestige)
opt <- fit$df
Now define the function for training the point predictor and the function actually used for prediction.
#' Training function for conformal.pred: fit a smoothing spline.
#'
#' @param x,y Training covariates and responses.
#' @param out Unused; required by the conformalInference train.fun interface.
#' @param df Effective degrees of freedom of the spline. Defaults to the
#'   globally computed `opt`, preserving the original hard-coded behavior
#'   while letting callers override the smoothness.
#' @return A fitted stats::smooth.spline object.
train_ss <- function(x, y, out=NULL, df=opt){
  smooth.spline(x, y, df=df)
}
#' Prediction function for conformal.pred: evaluate a fitted smoothing
#' spline at new covariate values and return the predicted responses.
predict_ss <- function(obj, new_x){
  fitted_vals <- predict(obj, new_x)
  fitted_vals$y
}
# Example of use
# trained_pred <- train_ss(income, prestige)
# predict_ss(trained_pred, income)
These ad-hoc functions can be given as input to the
conformal.pred function:
# Full conformal band using the custom smoothing-spline train/predict pair
c_preds <- conformal.pred(x=income, y=prestige, x0=income.grid,
alpha=alpha, verbose=F, train.fun=train_ss, predict.fun=predict_ss,
num.grid.pts=200)
plot(income, prestige, cex =.5, col ="lightblue", main='Smoothing spline')
lines(income.grid, c_preds$pred, lwd=2, col ="black", lty=1)
matlines(income.grid, cbind(c_preds$up,c_preds$lo), lwd=2, col ="black", lty=2)
GAMs
library(mgcv)
# Additive model: cubic-regression-spline smooths of education and income
model_gam <- gam(prestige ~ s(education, bs='cr') + s(income, bs='cr'))
# Evaluate the predictors on a grid
education.grid <- seq(range(education)[1], range(education)[2], length.out=100)
# NOTE(review): income.grid is overwritten here with a finer grid than the
# one built earlier; nothing below relies on the old step-100 grid
income.grid <- seq(range(income)[1], range(income)[2], length.out=100)
grid <- expand.grid(education.grid, income.grid)
names(grid) <- c('education', 'income')
pred <- predict(model_gam, newdata=grid)
library(rgl)
# Interactive 3D surface of the fitted GAM, with the observed points overlaid
persp3d(education.grid, income.grid, pred, col='forestgreen')
points3d(education, income, prestige, col='lightblue', size=5)
Again, we manually define the function to train the point predictor and the function used for prediction.
#' Training function for conformal.pred: fit an additive model with
#' cubic-regression-spline smooths on each of the two covariates.
#'
#' @param x Two-column matrix of covariates.
#' @param y Response vector.
#' @param out Unused; required by the conformalInference train.fun interface.
#' @return The fitted mgcv::gam model.
train_gam <- function(x, y, out=NULL){
  colnames(x) <- c('var1','var2')
  train_data <- data.frame(y, x)
  # Return the fit explicitly (the original relied on the invisible value
  # of an assignment as the function's result)
  gam(y ~ s(var1, bs='cr') + s(var2, bs='cr'), data=train_data)
}
#' Prediction function for conformal.pred: evaluate a fitted GAM on new
#' covariate values (matrix or data frame with two columns).
predict_gam <- function(obj, new_x){
  newdata <- data.frame(new_x)
  colnames(newdata) <- c('var1', 'var2')
  predict.gam(obj, newdata)
}
And we generate our predictions, just as before. In this case, let us do the predictions for just one test point, which has as values of education and income the median education and the median income of the observed points.
We can do it in a full conformal framework
# Full conformal framework
# Single test point: the median education and median income
c_preds <- conformal.pred(x=cbind(education,income), y=prestige,
x0=c(median(education),median(income)),
alpha=alpha, verbose=F,
train.fun=train_gam, predict.fun=predict_gam,
num.grid.pts=200)
sprintf("Point prediction: %f", c_preds$pred)
## [1] "Point prediction: -2.348590"
sprintf("Lower bound: %f", c_preds$lo)
## [1] "Lower bound: -14.452889"
sprintf("Upper bound: %f", c_preds$up)
## [1] "Upper bound: 10.395938"
and in a split conformal framework
# Split conformal framework
# Same test point; interval now depends on the random 50/50 split (rho)
c_preds_split <- conformal.pred.split(x=cbind(education,income), y=prestige,
x0=c(median(education),median(income)),
alpha=alpha, verbose=F,
train.fun=train_gam,
predict.fun=predict_gam,
rho = 0.5)
sprintf("Point prediction: %f", c_preds_split$pred)
## [1] "Point prediction: -2.257108"
sprintf("Lower bound: %f", c_preds_split$lo)
## [1] "Lower bound: -15.652509"
sprintf("Upper bound: %f", c_preds_split$up)
## [1] "Upper bound: 11.138294"
The conformalInference.multi package plays the
multivariate counterpart of conformalInference.
#install.packages('conformalInference.multi')
library(conformalInference.multi)
Let’s generate some multivariate data
# Simulate a small multivariate-response regression problem:
# n observations, p covariates, q response dimensions
n = 25
p = 4
q = 2
mu = rep(0,p)
x = mvtnorm::rmvnorm(n, mu)
# One coefficient vector per response dimension
beta = sapply(1:q, function(k) c(mvtnorm::rmvnorm(1,mu)))
# Responses: linear signal plus noise (noise mean grows with the index 1:n)
y = x%*%beta + t(mvtnorm::rmvnorm(q,1:(n)))
q = ncol(y)
# Use the last observation as the "new" test point
x0 = x[n,]
y0 = y[n,]
# NOTE(review): y0 is a plain vector here, so nrow(y0) yields NULL
n0 = nrow(y0)
Let’s declare the fitting and prediction function
# Training/prediction function pair based on the componentwise sample mean
fun=mean_multi()
And then run the usual full conformal algorithm
# Full conformal region for the multivariate response, l2 nonconformity score
final.full <- conformal.multidim.full(x, y, x0, fun$train.fun,
fun$predict.fun, score="l2",
num.grid.pts.dim=100, grid.factor=1.25,
verbose=FALSE)
plot_multidim(final.full)
## [[1]]
I can customise the ncm, of course
# Same construction with the max (sup-norm) nonconformity score instead of l2
final.full <- conformal.multidim.full(x, y, x0, fun$train.fun,
fun$predict.fun, score="max",
num.grid.pts.dim=100, grid.factor=1.25,
verbose=FALSE)
plot_multidim(final.full)
## [[1]]
First, we load the required packages and the data.
library(fda)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loading required package: fds
## Loading required package: rainbow
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## Loading required package: pcaPP
## Loading required package: RCurl
##
## Attaching package: 'RCurl'
## The following object is masked from 'package:tidyr':
##
## complete
## Loading required package: deSolve
##
## Attaching package: 'fda'
## The following object is masked from 'package:graphics':
##
## matplot
library(roahd)
##
## Attaching package: 'roahd'
## The following object is masked from 'package:fda':
##
## fbplot
library(conformalInference.fd)
##
## Attaching package: 'conformalInference.fd'
## The following object is masked from 'package:conformalInference.multi':
##
## computing_s_regression
data=growth #data from the berkeley growth study
# Male (green) and female (orange) height curves on a common plot
matplot(data$age, data$hgtm, type='l',col='forestgreen', xlab="Age", ylab="Height",
ylim=c(60,200))
# NOTE(review): matlines ignores xlab/ylab; they are harmless leftovers here
matlines(data$age, data$hgtf, type='l',col='darkorange', xlab="Age", ylab="Height")
We aim to generate prediction bands for a new female and for a new male. To do so, we need to separate the two datasets and conduct conformal inference on each of them separately.
# Transpose so each ROW is one curve (one subject), as fData expects
berkeley_f=t(data$hgtf)
berkeley_m=t(data$hgtm)
par(mfrow=c(1,2))
matplot(data$age, data$hgtf, type='l',col='darkorange', xlab="Age", ylab="Height",
main="Females", ylim=c(60,200))
matplot(data$age, data$hgtm, type='l',col='forestgreen', xlab="Age", ylab="Height",
main="Male", ylim=c(60,200))
We proceed in a split conformal setting.
We first identify the upper and the lower bounds of the prediction region for the females.
# Split conformal prediction band for the female growth curves
dataset <- berkeley_f
x.grid <- growth$age
f_data <- fData(x.grid, dataset)
rho <- 0.5 # train and calibration split ratio
alpha = 0.1
# Size of the calibration set after the split
l <- dim(dataset)[1] - ceiling(dim(dataset)[1] * rho)
# NOTE(review): l and alpha_rag are computed but never used below
alpha_rag <- c(seq(1/(l+1), l/(l+1), 1/(l+1)))
# Train/predict with the functional sample mean
fun=mean_lists()
x0=list(as.list(x.grid))
band <- conformal.fun.split(NULL, NULL, f_data, NULL, x0, fun$train.fun, fun$predict.fun,
alpha = alpha,
split=NULL, seed=2024, randomized=FALSE, seed.rand=FALSE,
verbose=FALSE, rho=rho, s.type="st-dev")
## [1] "Since x is not provided, the method used for prediction will be the mean \n"
# Extract the functional upper/lower bounds and the point prediction
upper_b_f <- band$up[[1]][[1]]
lower_b_f <- band$lo[[1]][[1]]
point_pred_f <- band$pred[[1]][[1]]
And then we do the same steps for the males.
# Repeat the split conformal band construction for the male growth curves
dataset <- berkeley_m
x.grid <- growth$age
f_data <- fData(x.grid, dataset)
rho <- 0.5 # train and calibration split ratio
alpha = 0.1
# Size of the calibration set after the split
l <- dim(dataset)[1] - ceiling(dim(dataset)[1] * rho)
# NOTE(review): l and alpha_rag are computed but never used below
alpha_rag <- c(seq(1/(l+1), l/(l+1), 1/(l+1)))
fun=mean_lists()
x0=list(as.list(x.grid))
band <- conformal.fun.split(NULL, NULL, f_data, NULL, x0, fun$train.fun, fun$predict.fun,
alpha = alpha,
split=NULL, seed=2024, randomized=FALSE, seed.rand=FALSE,
verbose=FALSE, rho=rho, s.type="st-dev")
## [1] "Since x is not provided, the method used for prediction will be the mean \n"
# Extract the functional upper/lower bounds and the point prediction
upper_b_m <- band$up[[1]][[1]]
lower_b_m <- band$lo[[1]][[1]]
point_pred_m <- band$pred[[1]][[1]]
Lastly, we plot the resulting prediction regions for a new female (left) and for a new male (right).
# Female band (left) and male band (right): dashed = bounds, solid = prediction
par(mfrow=c(1,2))
plot(x.grid, upper_b_f, type='l', col="black", lty=2, xlab="Age", ylab="Height", lwd=1.5,
ylim=c(60,200),
main="Functional CP band for a female")
lines(x.grid, point_pred_f, type='l', col="darkorange", lwd=1.5)
lines(x.grid, lower_b_f, type='l', col="black", lty=2, lwd=1.5)
plot(x.grid, upper_b_m, type='l', col="black", lty=2, xlab="Age", ylab="Height", lwd=1.5,
ylim=c(60,200),
main="Functional CP band for a male")
lines(x.grid, point_pred_m, type='l', col="forestgreen", lwd=1.5)
lines(x.grid, lower_b_m, type='l', col="black", lty=2, lwd=1.5)
Final remark
Let us just add a final comment on this. If instead of directly using
the conformalInference.fd package we wanted to manually
build the conformal prediction region, we would use the following
commands.
# Manual split conformal band for the male growth curves: the point
# predictor is the pointwise mean of the training half, and the
# nonconformity measure is the sup of the absolute residual curve.
alpha = 0.1
n = nrow(berkeley_m)
# floor() keeps the subsample size integral when n is odd
# (the original passed the non-integer n/2 straight to sample())
i1 = sample(1:n, floor(n/2))
t_set = berkeley_m[i1,] # training set
c_set = berkeley_m[-i1,] # calibration set
n.cal = dim(c_set)[1]
# Point predictor: the mean curve of the training set
mu = colMeans(t_set)
# Replicate the mean curve across calibration rows to form residuals
Mu = matrix(mu, nrow=n.cal, ncol=length(mu), byrow=TRUE)
res = abs(c_set - Mu)
# Nonconformity score: max absolute residual over the domain, per curve
ncm = apply(res, 1, max)
# Conformal quantile; cap the probability at 1 to avoid quantile() errors
ncm.quantile = quantile(ncm, prob=min(1, (1-alpha)*(n.cal+1)/n.cal))
# Constant-width band around the mean curve
upper_b = mu + ncm.quantile
lower_b = mu - ncm.quantile
With this method, we are not including the possibility for the conformal prediction region to adapt its width depending on the variability of the curves over the domain. If we wanted to have adaptivity, we could include a modulation function in the definition of the nonconformity measure. This is what we do below.
# Adaptive version: modulate the nonconformity measure by the pointwise
# standard deviation of the training curves, so the band widens where the
# curves are more variable over the domain.
S = apply(t_set, 2, sd)
# Scale each residual curve by the per-age sd. sweep() divides column j by
# S[j]; the original `(c_set-Mu)/S` recycled S down the ROWS (column-major),
# misaligning the scaling whenever n.cal != length(S). Absolute values are
# also taken so downward deviations count, matching the symmetric band
# mu +/- q*S (and the non-adaptive construction above).
res = abs(sweep(c_set - Mu, 2, S, "/"))
# Nonconformity score: max scaled absolute residual over the domain, per curve
ncm = apply(res, 1, max)
ncm_sort=c(sort(ncm), Inf) # ordered scores, Inf guards the top index
# Finite-sample conformal quantile: the ceiling((n+1)(1-alpha))-th order statistic
ncm.quantile.normalized = ncm_sort[ceiling((n.cal + 1)*(1-alpha))]
#ncm.quantile.normalized = quantile(ncm, prob=(1-alpha)*(n.cal+1)/n.cal)
upper_b_adaptive = mu + ncm.quantile.normalized*S
lower_b_adaptive = mu - ncm.quantile.normalized*S
Now we plot the two conformal prediction regions identified with the classical nonconformity measure (left) and the nonconformity measure with the modulation function (right).
# Left: non-adaptive band (constant width); right: adaptive band whose
# width is modulated by the pointwise sd of the training curves
par(mfrow=c(1,2))
plot(x.grid, upper_b, type='l', col="black", lty=2, xlab="Age", ylab="Height",
lwd=1.5, ylim=c(60,200),
main="Non-adaptive functional CP for a male")
lines(x.grid, mu, type='l', col="forestgreen", lwd=1.5)
lines(x.grid, lower_b, type='l', col="black", lty=2, lwd=1.5)
plot(x.grid, upper_b_adaptive, type='l', col="black", lty=2, xlab="Age", ylab="Height",
lwd=1.5, ylim=c(60,200),
main="Adaptive functional CP for a male")
lines(x.grid, mu, type='l', col="forestgreen", lwd=1.5)
lines(x.grid, lower_b_adaptive, type='l', col="black", lty=2, lwd=1.5)
What we notice is, in fact, that the conformalInference.fd package includes the modulation function (as default) to adapt the width of the region to the variability of the curves in different points of the domain.